library(readxl)
library(dplyr)
library(lubridate)
library(stringr)
library(ggplot2)
library(tidytext)
library(tidyverse)
library(tm)

# ---- @P4AHCF: load tweets and count monthly "Medicare for All" mentions ----
p4post <- read_excel("P4AHCF_after_March_11.xlsx")
p4pre <- read_excel("P4AHCF_before_March_11.xlsx")

# Print the raw structure of both sheets for a quick visual check.
str(p4post)
str(p4pre)

# The first data row of each sheet is used as the header: promote it to the
# column names, then drop it from the data.
colnames(p4post) <- as.character(unlist(p4post[1, ]))
p4post <- p4post[-1, ]

colnames(p4pre) <- as.character(unlist(p4pre[1, ]))
p4pre <- p4pre[-1, ]

# Put the columns of p4post in the same order as p4pre, then stack the two
# periods into a single table.
p4post <- p4post[, names(p4pre)]
merged_data_1 <- rbind(p4pre, p4post)

# Convert `created` to Date and derive the first day of each tweet's month.
merged_data_1$created <- as.Date(merged_data_1$created)
merged_data_1$month <- floor_date(merged_data_1$created, "month")

# Per month, count the tweets whose lower-cased text contains
# "medicare for all" or whose lower-cased hashtags contain "medicareforall".
# A tweet with a missing field that does not match on the other field yields
# NA and is skipped by na.rm = TRUE, i.e. it counts as "no mention".
# Months without any mention are kept with a count of 0. The former
# `!is.na(MFA_mentions)` filter was removed: a sum with na.rm = TRUE is never
# NA, so that filter could not drop any row.
filtered_data <- merged_data_1 %>%
  mutate(
    mention_MFA = str_detect(tolower(text), "medicare for all") |
      str_detect(tolower(hashtags), "medicareforall")
  ) %>%
  group_by(month) %>%
  summarise(MFA_mentions = sum(mention_MFA, na.rm = TRUE))


# ---- @PNHP: load tweets and count monthly "Medicare for All" mentions ----

pnpost <- read_excel("PNHP_After_March_11.xlsx")
pnpre <- read_excel("PNHP_before_March_11.xlsx")

# Show the raw structure of both sheets.
str(pnpost)
str(pnpre)

# Use the first data row of each sheet as its header and drop that row.
pnpost <- setNames(pnpost[-1, ], as.character(unlist(pnpost[1, ])))
pnpre <- setNames(pnpre[-1, ], as.character(unlist(pnpre[1, ])))

# Reorder the columns of pnpost to follow pnpre, then stack both periods.
pnpost <- pnpost[names(pnpre)]
merged_data <- rbind(pnpre, pnpost)

# Inspect the combined table.
str(merged_data)

# Store `created` as a Date.
merged_data[["created"]] <- as.Date(merged_data[["created"]])

# Flag tweets (1 / 0) whose lower-cased text contains "medicare for all" or
# whose lower-cased hashtags contain "medicareforall".
merged_data <- mutate(
  merged_data,
  MFA_present = ifelse(
    str_detect(tolower(text), "medicare for all") |
      str_detect(tolower(hashtags), "medicareforall"),
    1,
    0
  )
)

# Sum the flags per calendar month (first day of the month of `created`).
monthly_MFA <- merged_data %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(month) %>%
  summarise(MFA_mentions = sum(MFA_present, na.rm = TRUE))

# @P4AHCF: monthly "Medicare for All" mentions as a line, with a dashed red
# vertical marker at 2020-02-01 labelled "COVID-19" at the top of the series.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept here so the script behaves as before.
filtered_data %>%
  ggplot(mapping = aes(x = month, y = MFA_mentions)) +
  geom_line() +
  geom_vline(
    xintercept = as.numeric(as.Date("2020-02-01")),
    linetype = "dashed",
    color = "red",
    size = 1.5
  ) +
  annotate(
    geom = "text",
    x = as.Date("2020-02-01"),
    y = max(filtered_data[["MFA_mentions"]]),
    label = "COVID-19",
    fontface = "bold",
    hjust = 1.15
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") + # yearly ticks
  xlab("Year") +
  ylab("Number of Mentions of 'Medicare for All'") +
  ggtitle("'Medicare for All' Mentions Over Time by @P4AHCF") +
  theme_minimal()

# @PNHP: same chart built from monthly_MFA.
monthly_MFA %>%
  ggplot(mapping = aes(x = month, y = MFA_mentions)) +
  geom_line() +
  geom_vline(
    xintercept = as.numeric(as.Date("2020-02-01")),
    linetype = "dashed",
    color = "red",
    size = 1.5
  ) +
  annotate(
    geom = "text",
    x = as.Date("2020-02-01"),
    y = max(monthly_MFA[["MFA_mentions"]]),
    label = "COVID-19",
    fontface = "bold",
    hjust = 1.15
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") + # yearly ticks
  xlab("Year") +
  ylab("Number of Mentions of 'Medicare for All'") +
  ggtitle("'Medicare for All' Mentions Over Time in Merged Dataset for @PNHP") +
  theme_minimal()


# Word count per tweet: number of runs of non-whitespace characters in `text`.
# (Computed once per data set; the earlier duplicate computation for
# merged_data was removed.)
merged_data$word_count <- str_count(merged_data$text, "\\S+")
merged_data_1$word_count <- str_count(merged_data_1$text, "\\S+")

# merged_data_1 received its `month` column when it was built; add the same
# column to merged_data. `created` was already converted to Date right after
# the @PNHP sheets were merged, so it is not coerced again here.
merged_data$month <- floor_date(merged_data$created, "month")

# Average word count per month for @PNHP (merged_data) ...
average_word_count <- merged_data %>%
  group_by(month) %>%
  summarise(avg_word_count = mean(word_count, na.rm = TRUE))

# ... and for @P4AHCF (merged_data_1).
average_word_count_1 <- merged_data_1 %>%
  group_by(month) %>%
  summarise(avg_word_count = mean(word_count, na.rm = TRUE))

# @PNHP: average words per tweet by month, with a dashed red marker at
# 2020-02-01 labelled "COVID-19" at the height of the series maximum.
average_word_count %>%
  ggplot(mapping = aes(x = month, y = avg_word_count)) +
  geom_line() +
  geom_vline(
    xintercept = as.numeric(as.Date("2020-02-01")),
    linetype = "dashed",
    color = "red",
    size = 1.5
  ) +
  annotate(
    geom = "text",
    x = as.Date("2020-02-01"),
    y = max(average_word_count[["avg_word_count"]]),
    label = "COVID-19",
    fontface = "bold",
    hjust = 1.15
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  xlab("Year") +
  ylab("Average Word Count") +
  ggtitle("Average Word Count Per Tweet Over Time for @PNHP") +
  theme_minimal()


# @P4AHCF: same chart built from average_word_count_1.
average_word_count_1 %>%
  ggplot(mapping = aes(x = month, y = avg_word_count)) +
  geom_line() +
  geom_vline(
    xintercept = as.numeric(as.Date("2020-02-01")),
    linetype = "dashed",
    color = "red",
    size = 1.5
  ) +
  annotate(
    geom = "text",
    x = as.Date("2020-02-01"),
    y = max(average_word_count_1[["avg_word_count"]]),
    label = "COVID-19",
    fontface = "bold",
    hjust = 1.15
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  xlab("Year") +
  ylab("Average Word Count") +
  ggtitle("Average Word Count Per Tweet Over Time for @P4AHCF") +
  theme_minimal()

# Engagement columns: coerce `retweets` and `favorites` to numeric in both
# data sets so they can be summed.
merged_data[["retweets"]] <- as.numeric(merged_data[["retweets"]])
merged_data[["favorites"]] <- as.numeric(merged_data[["favorites"]])

merged_data_1[["retweets"]] <- as.numeric(merged_data_1[["retweets"]])
merged_data_1[["favorites"]] <- as.numeric(merged_data_1[["favorites"]])

# Monthly totals of retweets and favorites; missing values are ignored.
# @PNHP:
monthly_sums_merged <- summarise(
  group_by(merged_data, month),
  sum_retweets = sum(retweets, na.rm = TRUE),
  sum_favorites = sum(favorites, na.rm = TRUE)
)

# @P4AHCF:
monthly_sums_merged_1 <- summarise(
  group_by(merged_data_1, month),
  sum_retweets = sum(retweets, na.rm = TRUE),
  sum_favorites = sum(favorites, na.rm = TRUE)
)

# @PNHP: monthly retweet totals (blue line) and favorite totals (red line),
# with a dashed red marker at 2020-02-01 labelled "COVID-19" at the height of
# the larger of the two series maxima.
monthly_sums_merged %>%
  ggplot(mapping = aes(x = month)) +
  geom_line(mapping = aes(y = sum_retweets), color = "blue") +
  geom_line(mapping = aes(y = sum_favorites), color = "red") +
  geom_vline(
    xintercept = as.numeric(as.Date("2020-02-01")),
    linetype = "dashed",
    color = "red",
    size = 1.5
  ) +
  annotate(
    geom = "text",
    x = as.Date("2020-02-01"),
    y = max(c(
      monthly_sums_merged[["sum_retweets"]],
      monthly_sums_merged[["sum_favorites"]]
    )),
    label = "COVID-19",
    hjust = 1.15
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  xlab("Year") +
  ylab("Count") +
  ggtitle("Monthly Sums of Retweets (Blue) and Favorites (Red) Over Time for @PNHP") +
  theme_minimal()

# @P4AHCF: same chart built from monthly_sums_merged_1.
monthly_sums_merged_1 %>%
  ggplot(mapping = aes(x = month)) +
  geom_line(mapping = aes(y = sum_retweets), color = "blue") +
  geom_line(mapping = aes(y = sum_favorites), color = "red") +
  geom_vline(
    xintercept = as.numeric(as.Date("2020-02-01")),
    linetype = "dashed",
    color = "red",
    size = 1.5
  ) +
  annotate(
    geom = "text",
    x = as.Date("2020-02-01"),
    y = max(c(
      monthly_sums_merged_1[["sum_retweets"]],
      monthly_sums_merged_1[["sum_favorites"]]
    )),
    label = "COVID-19",
    hjust = 1.15
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  xlab("Year") +
  ylab("Count") +
  ggtitle("Monthly Sums of Retweets (Blue) and Favorites (Red) Over Time for @P4AHCF") +
  theme_minimal()

# Clean the text data

# Normalise a character vector of tweets for word-level analysis.
#
# Steps, in order: lower-case; turn typographic apostrophes into ASCII ones;
# strip URLs; remove English stop words; replace every non-letter with a
# space; drop words of one or two characters; remove stop words again.
#
# Stop words are removed once BEFORE punctuation is stripped so that
# contraction entries of the stop-word list (e.g. "don't") can still match.
# Previously they were split into fragments such as "don" that survived the
# cleaning. The final pass catches stop words that only become separate words
# after punctuation is replaced (e.g. "the_best" -> "the best").
#
# @param text Character vector of raw tweet texts.
# @return Character vector of the same length with the cleaned texts.
clean_tweet_text <- function(text) {
  stop_words <- stopwords("en")
  text <- tolower(text)
  text <- gsub("\u2019", "'", text, fixed = TRUE)
  text <- gsub('http\\S+\\s*', '', text)
  text <- removeWords(text, stop_words)
  text <- gsub('[^[:alpha:]]', ' ', text)
  text <- gsub('\\b\\w{1,2}\\b', '', text)
  removeWords(text, stop_words)
}

# The cleaned text overwrites the `text` column of both data sets.
merged_data$text <- clean_tweet_text(merged_data$text)
merged_data_1$text <- clean_tweet_text(merged_data_1$text)

# Monthly net sentiment per account using the "bing" lexicon:
# split `text` into one word per row, keep only words present in the lexicon,
# count positive and negative words per month, and score each month as
# (positive count - negative count). Months lacking one of the two classes get
# a count of 0 for it.
# The join key is stated explicitly (no "Joining with ..." message) and
# pivot_wider() replaces the superseded spread().

# @PNHP
merged_data_sentiments <- merged_data %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(month, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(sentiment = positive - negative)

# @P4AHCF
merged_data_1_sentiments <- merged_data_1 %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(month, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(sentiment = positive - negative)

# Highest monthly score across both accounts; used as the height of the
# "COVID-19" label.
max_sentiment <- max(
  c(merged_data_sentiments$sentiment, merged_data_1_sentiments$sentiment),
  na.rm = TRUE
)

# Both accounts on one chart, coloured by account, with a dashed red marker at
# 2020-02-01. The label is a single fixed annotation, so annotate() is used
# instead of geom_text() with constant aesthetics.
ggplot() +
  geom_line(
    data = merged_data_sentiments,
    aes(x = month, y = sentiment, color = "PNHP")
  ) +
  geom_line(
    data = merged_data_1_sentiments,
    aes(x = month, y = sentiment, color = "P4AHCF")
  ) +
  geom_vline(
    xintercept = as.Date("2020-02-01"),
    linetype = "dashed",
    color = "red",
    size = 1.5
  ) +
  annotate(
    "text",
    x = as.Date("2020-02-01"),
    y = max_sentiment,
    label = "COVID-19",
    hjust = 1.1,
    fontface = "bold",
    color = "black"
  ) +
  labs(x = "Month", y = "Sentiment Score", color = "Twitter Account") +
  ggtitle("Sentiment Analysis of Tweets Over Time using the 'bing' Lexicon") +
  theme_minimal()

# Load the necessary libraries.
# koRpus must be attached BEFORE install.koRpus.lang() is called, because that
# function is provided by koRpus itself.
library(koRpus)
library(stringi)

# Install the English language support only when it is missing, so the script
# does not reinstall it on every run, then attach it so that lang = "en" is
# available to tokenize().
if (!requireNamespace("koRpus.lang.en", quietly = TRUE)) {
  install.koRpus.lang("en")
}
library(koRpus.lang.en)

# Compute the Gunning Fog index of a single text.
#
# @param text A single character string.
# @return The Gunning Fog index as a numeric scalar, or NA_real_ when `text`
#   is not a single non-missing, non-blank string or when tokenizing /
#   scoring fails.
#
# Fixes over the previous version:
#   * the error handler used to return NA and the function then evaluated
#     `NA$score`, which is itself an error raised outside the tryCatch();
#   * tokenize() reads from a file path unless format = "obj" is given;
#   * readability() selects measures through `index` ("FOG" for Gunning Fog),
#     and its result is an S4 object, so the value is read from the "FOG"
#     slot instead of `$score`.
calculate_gunning_fog <- function(text) {
  # Nothing to score: skip the (expensive) tokenizer altogether.
  if (!is.character(text) || length(text) != 1L || is.na(text) ||
      !nzchar(trimws(text))) {
    return(NA_real_)
  }

  tryCatch(
    {
      text_obj <- tokenize(text, format = "obj", lang = "en")
      fog_result <- readability(text_obj, index = "FOG")
      # NOTE(review): assumes the "FOG" slot is a list whose `FOG` element
      # holds the index value -- confirm against the installed koRpus version.
      fog_value <- slot(fog_result, "FOG")[["FOG"]]
      if (is.numeric(fog_value) && length(fog_value) == 1L) {
        fog_value
      } else {
        NA_real_
      }
    },
    # Best effort: a text that cannot be scored yields NA instead of aborting
    # the whole run.
    error = function(e) NA_real_
  )
}

# Calculate the Gunning Fog index for each text.
# vapply() guarantees one numeric value per tweet and, unlike sapply(), does
# not attach the tweet texts as element names.
# NOTE(review): at this point `text` has already been lower-cased and stripped
# of punctuation and stop words by the cleaning step above; a readability
# index is normally computed on the original sentences -- confirm that this is
# intended.
merged_data$gunning_fog <- vapply(
  merged_data$text, calculate_gunning_fog, numeric(1), USE.NAMES = FALSE
)
merged_data_1$gunning_fog <- vapply(
  merged_data_1$text, calculate_gunning_fog, numeric(1), USE.NAMES = FALSE
)

# Month of each tweet (first day of the month), derived from `created`, the
# date column used throughout this script. The previous code read
# `created_at`, a name used nowhere else in this script.
merged_data$month <- floor_date(merged_data$created, "month")
merged_data_1$month <- floor_date(merged_data_1$created, "month")

# Average Gunning Fog index per month. Using bare column names with `data =`
# gives result columns named `month` and `gunning_fog`; the former
# `merged_data$gunning_fog ~ merged_data$month` produced columns literally
# named "merged_data$month" / "merged_data$gunning_fog".
merged_data_monthly <- aggregate(
  gunning_fog ~ month, data = merged_data, FUN = mean, na.rm = TRUE
)
merged_data_1_monthly <- aggregate(
  gunning_fog ~ month, data = merged_data_1, FUN = mean, na.rm = TRUE
)

# Plot the monthly averages of both accounts. The aesthetics refer to the
# columns of the aggregated tables, not to the per-tweet data.
ggplot() +
  geom_line(
    data = merged_data_monthly,
    aes(x = month, y = gunning_fog, color = "PNHP")
  ) +
  geom_line(
    data = merged_data_1_monthly,
    aes(x = month, y = gunning_fog, color = "P4AHCF")
  ) +
  labs(x = "Month", y = "Average Gunning Fog Index", color = "Screen Name") +
  ggtitle("Average Gunning Fog Index of Tweets Over Time") +
  theme_minimal()




